Title
summary
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
import load_covid_data
from IPython.display import display, Markdown
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import arviz as az
import pymc3 as pm
import altair as alt
sns.set_context('talk')
plt.style.use('seaborn-whitegrid')
debug=False
df = load_covid_data.load_data(drop_states=True, filter_n_days_100=2)
df = df.loc[lambda x: x.country != 'Chinal (total)']
countries = df.country.unique()
n_countries = len(countries)
df = df.loc[lambda x: (x.days_since_100 >= 0)]
df.index.max().strftime('%B %d, %Y')
annotate_kwargs = dict(
s='Dateset by COVID Data Repository by Johns Hopkins CSSE ({})\nBy Guilherme Diaz-Berrio, based on the work by Thomas Wiecki'.format(df.index.max().strftime('%B %d, %Y')),
xy=(0.05, 0.01), xycoords='figure fraction', fontsize=10)
annotate_kwargs
', '.join(countries.tolist())
with pm.Model() as exp_model:
# Intercept
a_grp = pm.Normal('a_grp', 100, 50) # Group Mean
a_grp_sigma = pm.HalfNormal('a_grp_sigma', 50) # Group Variance
a_ind = pm.Normal('a_ind', mu=a_grp, sigma=a_grp_sigma, shape=n_countries) # Individual Intercepts
# Slope
b_grp = pm.Normal('b_grp', 1.33, 0.5) # Group Mean
b_grp_sigma = pm.HalfNormal('b_grp_sigma', .5) # Group Variance
b_ind = pm.Normal('b_ind', mu=b_grp, sigma=b_grp_sigma, shape=n_countries) # Individual slopes
# Error
sigma = pm.HalfNormal('sigma', 500., shape=n_countries)
# Create likelihood for each country
for i, country in enumerate(countries):
df_country = df.loc[lambda x: (x.country == country)]
# By using pm.Data we can change these values after sampling.
# This allows us to extend x into the future so we can get
# forecasts by sampling from the posterior predictive
x = pm.Data(country + "x_data", df_country.days_since_100.values)
confirmed = pm.Data(country + "y_data", df_country.confirmed.astype('float64').values)
# Likelihood
pm.NegativeBinomial(
country,
(a_ind[i] * b_ind[i] ** x), # Exponential Regression
sigma[i],
observed=confirmed)
with exp_model:
trace = pm.sample(tune=1500, chains=2, cores=1, target_accept=.9)
# update data so we get predictions into the future
for country in countries:
df_country = df.loc[lambda x: (x.country == country)]
x_data = np.arange(0, 45)
y_data = np.array([np.nan] * len(x_data))
pm.set_data({country + "x_data": x_data})
pm.set_data({country + "y_data": y_data})
post_pred = pm.sample_posterior_predictive(trace, samples=80)
# flatten predictions & target for each country into a pandas DataFrame
predictions_dfs_list = []
for country in post_pred:
arr = post_pred[country]
preds = arr.flatten().tolist() # get predictions in flattened array
pred_idx = np.indices(arr.shape)[0].flatten().tolist() # predictions for the model (grey lines)
days_since = np.indices(arr.shape)[1].flatten().tolist() # days since 100 cases
pred_df = pd.DataFrame(
{
'country': country,
'predictions': preds,
'pred_idx': pred_idx,
'days_since_100': days_since
}
)
predictions_dfs_list.append(pred_df)
predictionsDF = pd.concat(predictions_dfs_list)
# Compute the maximum value to plot on the y-axis as 15x the last confirmed case
ylims = pd.DataFrame(df.groupby('country').last().confirmed * 15).reset_index()
ylims.columns = ['country', 'ylim']
# filter predictions that exceed ylims
predictionsDF_filtered = (predictionsDF.merge(ylims, on='country', how='left').loc[lambda x: x.predictions <= x.ylim])
# compute 33% daily growth rate for reference
first_case_count = df.groupby('country').first().confirmed.reset_index()
date_anchor = predictionsDF_filtered[['country', 'days_since_100']].drop_duplicates()
max_pred = predictionsDF_filtered.groupby('country').max()[['predictions']].reset_index()
benchmark = (date_anchor
.merge(first_case_count, on='country', how='left')
.merge(max_pred, on='country', how='left'))
benchmark['benchmark'] = benchmark.apply(lambda x: x.confirmed * (1.3 ** (x.days_since_100)), axis=1)
benchmarkDF_filtered = benchmark.loc[lambda x: x.benchmark <= x.predictions]
# compute last known confirmed case
lastpointDF = df.groupby('country').last().reset_index()
# DF of chart titles. Hack for Altiar to switch values
titleDF = lastpointDF[['country']]
titleDF['title'] = titleDF.apply(lambda x: x.country + ': Actual vs Predicted Growth', axis=1)
european_countries = ['Italy', 'Spain', 'United Kingdom (total)', 'France (total)', 'Portugal']
asian_countries = ['Korea, South', 'Japan', 'China (total)', 'Singapore']
country_groups = [european_countries, asian_countries]
line_styles = ['-', ':', '--', '-.']
fig, axs = plt.subplots(nrows=len(country_groups), figsize=(8, 16), sharex=False)
for ax, country_group in zip(axs, country_groups):
for i, country in enumerate(countries):
if country in country_group:
sns.distplot((trace['b_ind'][:, i] * 100) - 100, ax=ax, label=country, hist=False)
display(f"Country Mean Growth - {country}:{np.mean((trace['b_ind'][:, i] * 100) - 100)}")
ax.axvline(33, ls='--', color='k', label='33% daily growth')
ax.legend()
ax.set_xlabel('Daily Growth in %')
plt.suptitle('Posterior of daily growth')
# Charts Linear Scale
sub_countries = ['Italy', 'Spain', 'United Kingdom (total)', 'France (total)', 'Portugal', 'Korea, South']
fig, axs = plt.subplots(nrows=(len(sub_countries) // 3), ncols=3, figsize=(30, 30), sharex=True)
for ax, country in zip(axs.flatten(), sub_countries):
df_country = df.loc[lambda x: x.country == country]
ax.plot(df_country.days_since_100, df_country.confirmed, color='r')
ax.plot(np.arange(0, post_pred[country].shape[1]), post_pred[country].T, alpha=.05, color='.5')
ax.plot(df_country.days_since_100, df_country.confirmed, color='r')
ax.set_ylim(0, df_country.confirmed.iloc[-1] * 15)
ax.set_title(country)
axs[0, 0].legend(['data', 'model prediction'])
[ax.set(xlabel='Days since 100 cases') for ax in axs[-1, :]]
[ax.set(ylabel='Confirmed cases') for ax in axs[:, 0]]
plt.show()
# Charts Log Scale
sub_countries = ['Italy', 'Spain', 'United Kingdom (total)', 'France (total)', 'Portugal', 'Korea, South']
fig, axs = plt.subplots(nrows=(len(sub_countries) // 3), ncols=3, figsize=(30, 30), sharex=True)
for ax, country in zip(axs.flatten(), sub_countries):
df_country = df.loc[lambda x: x.country == country]
ax.plot(df_country.days_since_100, df_country.confirmed, color='r')
ax.plot(np.arange(0, post_pred[country].shape[1]), post_pred[country].T, alpha=.05, color='.5')
ax.plot(df_country.days_since_100, df_country.confirmed, color='r')
ax.set_yscale('log')
ax.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
ax.set_title(country)
axs[0, 0].legend(['data', 'model prediction'])
[ax.set(xlabel='Days since 100 cases') for ax in axs[-1, :]]
[ax.set(ylabel='Confirmed cases') for ax in axs[:, 0]]
plt.show()
az.plot_trace(trace, compact=True)